'''
  Scrape articles from a WeChat Official Account
'''
#1. Import third-party libraries
#coding=utf-8  (NOTE: ineffective here — PEP 263 only honours this on line 1 or 2)
import io
import re
import sys
import time

import requests
from bs4 import BeautifulSoup

#2.设定输出编码简体中文正常显示
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')

#3.设置requests模拟请求参数，记得修改成自己的fakeid和token
#  请求地址
root_url='https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=1&count=50&fakeid=MzUxNzE5MTMyM111w==&type=9&query=&token=548272228719&lang=zh_CN&f=json&ajax=1'
#  请求头参数,记得修改自己的Cookie和User-Agent
headers = {
    'Cookie': 'appmsglist_action_3272298029=card; noticeLoginFlag=1; pgv_pvi=8638476288; RK=8SvCQphKTp; tvfe_boss_uuid=19abd60d8ea5a3c9; mobileUV=1_15f77d5b4e5_b34d5; pac_uid=1_412308234; ptcz=2ea306f52fbef91bbe56e58396ce6b9898e26894273ea255cc95c40a2608644b; o_cookie=412308234; pgv_pvid=1627028880; ua_id=GA6QewrT2M44sIpbAAAAAFhAdVBZUay93BNjt2UXcJg=; mm_lang=zh_CN; wxuin=76842257318315; ied_qq=o0412308234; ts_uid=1410584387; _ga=GA1.2.500266847.1591521463; noticeLoginFlag=1; _qpsvr_localtk=0.48345851763383574; pgv_si=s4856183808; uuid=8336ec83548895ca4a2619921585c5b5; ticket=84d8e80c50424f819787f97c34a6215af492ff57; ticket_id=gh_c273de42fa9b; cert=naUZ5cQeVyRC9SiQSftnBBnwcSG9YsSy; rand_info=CAESIGCLSkQhga5MX3eRc3qoUOQNr0e6mmqR0bVckDJnLzVs; slave_bizuin=3272298029; data_bizuin=3207372177; bizuin=3272298029; data_ticket=ltTS7010ojAr/qKfJWXHzBb11fK225UkHjg3rpfzHBsKPOJxWwm2+bECbLvr29+z; slave_sid=Y1llVERvd3lWcVhLb2o5VUpPVUlWSVdOUFRkMzJOMzZFWmd3VDM0ZXI2R3VpY0VMUG1EeHF5VHdkTXI5Ull5RTRvVWkwbUFnY01EbVpDWWx3VGJtRXpDdWtkNDMxZjF2U0lZMHNWNTdEd3JJUHYyY2NrTUF3SmV4TnFEQlI4eFZjU2lobWROT0E5bWJoQlZF; slave_user=gh_c273de42fa9b; xid=8108dd9d423d25b9a0f06ba437aa2783; openid2ticket_okZ0sv0SrUKupmq9ehTzUf6wpxB4=KDY0ztCDh2kDfIrIsgPhoIlNuRPKQKO47tfXDp5T/OQ=; rewardsn=; wxtokenkey=777',
    'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13F69 MicroMessenger/6.6.2 NetType/WIFI Language/zh_CN'
}

#4.使用requests获取XHR数据，并保存公众号文章标题和链接地址
def getTitleLink(url):
    r=requests.get(url,headers=headers)
    articleList=r.json()
    print(articleList['app_msg_list'])
    articleTitles=[item['title'] for item in articleList['app_msg_list']]
    articleLinks=[item['link'] for item in articleList['app_msg_list']]
    return articleTitles,articleLinks

#5.使用requests获取每篇文章的详细文字内容
def getContent(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    result = soup.select('#js_content')
    content = [item.text for item in result]
    return content

if __name__ == '__main__':
    titles,links=getTitleLink(root_url)    
    for title,link in zip(titles,links):
        with open(file=title+'.txt',mode='w+',encoding='utf-8') as f:
            for paragraph in getContent(link):
                f.write(paragraph)
                f.write('\n')
        print("爬取完当前文章，等待10s后爬取下一章...")
        time.sleep(10)

    print("大功告成！")